Sys.Date()
## [1] "2020-12-18"

Authors: Hanna Klimczak, Kamil Pluciński

Libraries used

library(plotly)
library(knitr)
library(kableExtra)
library(caret)

Providing data reproductibility

To ensure that running the notebook will always return the same output, we set seed to 0.

set.seed(0)

Reading data from file

data <- read.csv("Life_Expectancy_Data.csv")
kable(head(data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
Country Year Status Life.expectancy Adult.Mortality infant.deaths Alcohol percentage.expenditure Hepatitis.B Measles BMI under.five.deaths Polio Total.expenditure Diphtheria HIV.AIDS GDP Population thinness..1.19.years thinness.5.9.years Income.composition.of.resources Schooling
Afghanistan 2015 Developing 65.0 263 62 0.01 71.279624 65 1154 19.1 83 6 8.16 65 0.1 584.25921 33736494 17.2 17.3 0.479 10.1
Afghanistan 2014 Developing 59.9 271 64 0.01 73.523582 62 492 18.6 86 58 8.18 62 0.1 612.69651 327582 17.5 17.5 0.476 10.0
Afghanistan 2013 Developing 59.9 268 66 0.01 73.219243 64 430 18.1 89 62 8.13 64 0.1 631.74498 31731688 17.7 17.7 0.470 9.9
Afghanistan 2012 Developing 59.5 272 69 0.01 78.184215 67 2787 17.6 93 67 8.52 67 0.1 669.95900 3696958 17.9 18.0 0.463 9.8
Afghanistan 2011 Developing 59.2 275 71 0.01 7.097109 68 3013 17.2 97 68 7.87 68 0.1 63.53723 2978599 18.2 18.2 0.454 9.5
Afghanistan 2010 Developing 58.8 279 74 0.01 79.679367 66 1989 16.7 102 66 9.20 66 0.1 553.32894 2883167 18.4 18.4 0.448 9.2

Dataset summary and basic statistics

nrow(data)
## [1] 2938
kable(summary(data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
Country Year Status Life.expectancy Adult.Mortality infant.deaths Alcohol percentage.expenditure Hepatitis.B Measles BMI under.five.deaths Polio Total.expenditure Diphtheria HIV.AIDS GDP Population thinness..1.19.years thinness.5.9.years Income.composition.of.resources Schooling
Length:2938 Min. :2000 Length:2938 Min. :36.30 Min. : 1.0 Min. : 0.0 Min. : 0.0100 Min. : 0.000 Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00 Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100 Min. : 1.68 Min. :3.400e+01 Min. : 0.10 Min. : 0.10 Min. :0.0000 Min. : 0.00
Class :character 1st Qu.:2004 Class :character 1st Qu.:63.10 1st Qu.: 74.0 1st Qu.: 0.0 1st Qu.: 0.8775 1st Qu.: 4.685 1st Qu.:77.00 1st Qu.: 0.0 1st Qu.:19.30 1st Qu.: 0.00 1st Qu.:78.00 1st Qu.: 4.260 1st Qu.:78.00 1st Qu.: 0.100 1st Qu.: 463.94 1st Qu.:1.958e+05 1st Qu.: 1.60 1st Qu.: 1.50 1st Qu.:0.4930 1st Qu.:10.10
Mode :character Median :2008 Mode :character Median :72.10 Median :144.0 Median : 3.0 Median : 3.7550 Median : 64.913 Median :92.00 Median : 17.0 Median :43.50 Median : 4.00 Median :93.00 Median : 5.755 Median :93.00 Median : 0.100 Median : 1766.95 Median :1.387e+06 Median : 3.30 Median : 3.30 Median :0.6770 Median :12.30
NA Mean :2008 NA Mean :69.22 Mean :164.8 Mean : 30.3 Mean : 4.6029 Mean : 738.251 Mean :80.94 Mean : 2419.6 Mean :38.32 Mean : 42.04 Mean :82.55 Mean : 5.938 Mean :82.32 Mean : 1.742 Mean : 7483.16 Mean :1.275e+07 Mean : 4.84 Mean : 4.87 Mean :0.6276 Mean :11.99
NA 3rd Qu.:2012 NA 3rd Qu.:75.70 3rd Qu.:228.0 3rd Qu.: 22.0 3rd Qu.: 7.7025 3rd Qu.: 441.534 3rd Qu.:97.00 3rd Qu.: 360.2 3rd Qu.:56.20 3rd Qu.: 28.00 3rd Qu.:97.00 3rd Qu.: 7.492 3rd Qu.:97.00 3rd Qu.: 0.800 3rd Qu.: 5910.81 3rd Qu.:7.420e+06 3rd Qu.: 7.20 3rd Qu.: 7.20 3rd Qu.:0.7790 3rd Qu.:14.30
NA Max. :2015 NA Max. :89.00 Max. :723.0 Max. :1800.0 Max. :17.8700 Max. :19479.912 Max. :99.00 Max. :212183.0 Max. :87.30 Max. :2500.00 Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600 Max. :119172.74 Max. :1.294e+09 Max. :27.70 Max. :28.60 Max. :0.9480 Max. :20.70
NA NA NA NA’s :10 NA’s :10 NA NA’s :194 NA NA’s :553 NA NA’s :34 NA NA’s :19 NA’s :226 NA’s :19 NA NA’s :448 NA’s :652 NA’s :34 NA’s :34 NA’s :167 NA’s :163

Dealing with missing values

As we can see from the summary above, there are some missing values in the dataset. Due to the fact that life.expectancy is the most important attribute for our analysis, we have decided to remove all rows where life.expectancy is NA.

data_new <- data[complete.cases(data[ , 'Life.expectancy']),]

After this operation, we still encounter missing values in the following columns: “Alcohol”, “Hepatitis.B”, “BMI”, “Polio”, “Total.expenditure”, “Diphtheria”, “GDP”, “Population”, “thinness..1.19.years”, “thinness.5.9.years”, “Income.composition.of.resources”, “Schooling”. We will fill these NA values with median for each column.

na_columns <- list("Alcohol", "Hepatitis.B", "BMI", "Polio", "Total.expenditure", "Diphtheria", "GDP", "Population", "thinness..1.19.years", "thinness.5.9.years", "Income.composition.of.resources", "Schooling")

for (col in na_columns){
    m <- median(data_new[ , col], na.rm=TRUE)
    print(col)
    print(m)
    data_new[ , col][is.na(data_new[ , col])] <- m
}
## [1] "Alcohol"
## [1] 3.77
## [1] "Hepatitis.B"
## [1] 92
## [1] "BMI"
## [1] 43.35
## [1] "Polio"
## [1] 93
## [1] "Total.expenditure"
## [1] 5.75
## [1] "Diphtheria"
## [1] 93
## [1] "GDP"
## [1] 1764.974
## [1] "Population"
## [1] 1391757
## [1] "thinness..1.19.years"
## [1] 3.3
## [1] "thinness.5.9.years"
## [1] 3.4
## [1] "Income.composition.of.resources"
## [1] 0.677
## [1] "Schooling"
## [1] 12.3
summary(data_new)
##    Country               Year         Status          Life.expectancy
##  Length:2928        Min.   :2000   Length:2928        Min.   :36.30  
##  Class :character   1st Qu.:2004   Class :character   1st Qu.:63.10  
##  Mode  :character   Median :2008   Mode  :character   Median :72.10  
##                     Mean   :2008                      Mean   :69.22  
##                     3rd Qu.:2011                      3rd Qu.:75.70  
##                     Max.   :2015                      Max.   :89.00  
##  Adult.Mortality infant.deaths        Alcohol       percentage.expenditure
##  Min.   :  1.0   Min.   :   0.00   Min.   : 0.010   Min.   :    0.000     
##  1st Qu.: 74.0   1st Qu.:   0.00   1st Qu.: 1.107   1st Qu.:    4.854     
##  Median :144.0   Median :   3.00   Median : 3.770   Median :   65.611     
##  Mean   :164.8   Mean   :  30.41   Mean   : 4.559   Mean   :  740.321     
##  3rd Qu.:228.0   3rd Qu.:  22.00   3rd Qu.: 7.400   3rd Qu.:  442.614     
##  Max.   :723.0   Max.   :1800.00   Max.   :17.870   Max.   :19479.912     
##   Hepatitis.B       Measles              BMI        under.five.deaths
##  Min.   : 1.00   Min.   :     0.0   Min.   : 1.00   Min.   :   0.00  
##  1st Qu.:82.00   1st Qu.:     0.0   1st Qu.:19.40   1st Qu.:   0.00  
##  Median :92.00   Median :    17.0   Median :43.35   Median :   4.00  
##  Mean   :83.05   Mean   :  2427.9   Mean   :38.29   Mean   :  42.18  
##  3rd Qu.:96.00   3rd Qu.:   362.2   3rd Qu.:56.10   3rd Qu.:  28.00  
##  Max.   :99.00   Max.   :212183.0   Max.   :77.60   Max.   :2500.00  
##      Polio       Total.expenditure   Diphtheria       HIV.AIDS     
##  Min.   : 3.00   Min.   : 0.370    Min.   : 2.00   Min.   : 0.100  
##  1st Qu.:78.00   1st Qu.: 4.370    1st Qu.:78.00   1st Qu.: 0.100  
##  Median :93.00   Median : 5.750    Median :93.00   Median : 0.100  
##  Mean   :82.62   Mean   : 5.916    Mean   :82.39   Mean   : 1.748  
##  3rd Qu.:97.00   3rd Qu.: 7.330    3rd Qu.:97.00   3rd Qu.: 0.800  
##  Max.   :99.00   Max.   :17.600    Max.   :99.00   Max.   :50.600  
##       GDP              Population        thinness..1.19.years
##  Min.   :     1.68   Min.   :3.400e+01   Min.   : 0.100      
##  1st Qu.:   578.80   1st Qu.:4.181e+05   1st Qu.: 1.600      
##  Median :  1764.97   Median :1.392e+06   Median : 3.300      
##  Mean   :  6627.39   Mean   :1.026e+07   Mean   : 4.834      
##  3rd Qu.:  4793.63   3rd Qu.:4.593e+06   3rd Qu.: 7.100      
##  Max.   :119172.74   Max.   :1.294e+09   Max.   :27.700      
##  thinness.5.9.years Income.composition.of.resources   Schooling    
##  Min.   : 0.100     Min.   :0.0000                  Min.   : 0.00  
##  1st Qu.: 1.600     1st Qu.:0.5040                  1st Qu.:10.30  
##  Median : 3.400     Median :0.6770                  Median :12.30  
##  Mean   : 4.865     Mean   :0.6301                  Mean   :12.02  
##  3rd Qu.: 7.200     3rd Qu.:0.7730                  3rd Qu.:14.10  
##  Max.   :28.600     Max.   :0.9480                  Max.   :20.70

As we can see, we have successfully dealt with missing values.

In-depth analysis of attributes (i.e. value distribution)

names(data_new)
##  [1] "Country"                         "Year"                           
##  [3] "Status"                          "Life.expectancy"                
##  [5] "Adult.Mortality"                 "infant.deaths"                  
##  [7] "Alcohol"                         "percentage.expenditure"         
##  [9] "Hepatitis.B"                     "Measles"                        
## [11] "BMI"                             "under.five.deaths"              
## [13] "Polio"                           "Total.expenditure"              
## [15] "Diphtheria"                      "HIV.AIDS"                       
## [17] "GDP"                             "Population"                     
## [19] "thinness..1.19.years"            "thinness.5.9.years"             
## [21] "Income.composition.of.resources" "Schooling"
quantitive_cols = c("Year", "Life.expectancy", "Adult.Mortality", "infant.deaths", "Alcohol", "percentage.expenditure", "Hepatitis.B", "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure", "Diphtheria","HIV.AIDS","GDP","Population", "thinness..1.19.years","thinness.5.9.years",     "Income.composition.of.resources", "Schooling")
l <- htmltools::tagList()

i <- 1
for (col in quantitive_cols){
  l[[i]] <- plot_ly(y = data_new[, col], type = "box",name=col, quartilemethod="exclusive")
  i <- i + 1
}

l
p1 <- plot_ly(data_new, x = ~Status) %>%
  add_histogram()

p1

Corelation between variables (graphic presentation)

correlation_data <- data_new[quantitive_cols]

fig <- plot_ly(
    x = quantitive_cols,
    y = quantitive_cols,
    z = cor(correlation_data), type = "heatmap"
)

fig

The above chart gives us important information about the correlation between variables. As we can see, there is almost perfect correlation between thinness.1.19.years and thinness.5.9.years. There also appears to be a very strong correlation between GDP and percentage.expenditure (0.9), as well as infant.deaths and under.five.deaths (0.99). Naturally, we can also see a strong negative correlation between Adult.Mortality and Life.expectancy (-0.69). Schooling and life expectancy seem to be slightly correlated as well (0.71).

For our prediction, we will need to drop some of the features that are highly correlated. We will drop thinness.5.9.years, percentage.expenditure and infant.deaths, as their correlation to our decision variable is weaker than features correlated to them.

Interavtive plot for average life duration per country with respect to year

The following plot is fully interactive - hover over selected line to see a specific value, select one country by double clicking on it in the legend. If you want to return to the view of all countries - double click on the legend again.

countries <- unique(data_new$Country)
fig <- plot_ly(data, x = data_new[data_new$Country == 'Afghanistan', 'Year'])

for(name in countries){
  fig <- fig %>% add_trace(y = data_new[data_new$Country == name, 'Life.expectancy'], name = name, type = 'scatter', mode = 'lines') 
}

fig <- fig %>% layout(legend = list(orientation = 'h'))


fig

Data preparation

regression_data <- data_new[, !names(data_new) %in% c("Country", "thinness.5.9.years", "percentage.expenditure", "infant.deaths", "Year")]


kable(head(regression_data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
Status Life.expectancy Adult.Mortality Alcohol Hepatitis.B Measles BMI under.five.deaths Polio Total.expenditure Diphtheria HIV.AIDS GDP Population thinness..1.19.years Income.composition.of.resources Schooling
Developing 65.0 263 0.01 65 1154 19.1 83 6 8.16 65 0.1 584.25921 33736494 17.2 0.479 10.1
Developing 59.9 271 0.01 62 492 18.6 86 58 8.18 62 0.1 612.69651 327582 17.5 0.476 10.0
Developing 59.9 268 0.01 64 430 18.1 89 62 8.13 64 0.1 631.74498 31731688 17.7 0.470 9.9
Developing 59.5 272 0.01 67 2787 17.6 93 67 8.52 67 0.1 669.95900 3696958 17.9 0.463 9.8
Developing 59.2 275 0.01 68 3013 17.2 97 68 7.87 68 0.1 63.53723 2978599 18.2 0.454 9.5
Developing 58.8 279 0.01 66 1989 16.7 102 66 9.20 66 0.1 553.32894 2883167 18.4 0.448 9.2

Regressor for average life duration estimation

Train test valid split

trainval_partition <- 
    createDataPartition(
        y = regression_data$Life.expectancy,
        p = .8,
        list = FALSE)

trainval_data <- regression_data[ trainval_partition,]
test_data  <- regression_data[-trainval_partition,]

train_partition <- 
    createDataPartition(
        y = trainval_data$Life.expectancy,
        p = .8,
        list = FALSE)

train_data <- trainval_data[ train_partition,]
val_data  <- trainval_data[-train_partition,]
control <- trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 5)
linear <- train(Life.expectancy ~ .,
               data = train_data,
               trControl = control,
               preProcess = c('scale', 'center'),
               method = "lm")
linear
## Linear Regression 
## 
## 1878 samples
##   16 predictor
## 
## Pre-processing: scaled (16), centered (16) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1690, 1690, 1690, 1690, 1690, 1690, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.232504  0.8022808  3.159272
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Model parameter tuning and accuracy calculation using R2 and RMSE measures

lasso <- train(Life.expectancy ~ .,
               data = train_data,
               trControl = control,
               preProcess = c('scale', 'center'),
               method = "lasso")
lasso
## The lasso 
## 
## 1878 samples
##   16 predictor
## 
## Pre-processing: scaled (16), centered (16) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1690, 1690, 1689, 1690, 1688, 1691, ... 
## Resampling results across tuning parameters:
## 
##   fraction  RMSE      Rsquared   MAE     
##   0.1       8.534207  0.6981532  6.941555
##   0.5       5.498991  0.7556058  4.146438
##   0.9       4.251140  0.8009618  3.155516
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
ridge <- train(Life.expectancy ~ .,
               data = train_data,
               trControl = control,
               preProcess = c('scale', 'center'),
               method = "ridge")
ridge
## Ridge Regression 
## 
## 1878 samples
##   16 predictor
## 
## Pre-processing: scaled (16), centered (16) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1689, 1691, 1691, 1689, 1689, 1690, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE      Rsquared   MAE     
##   0e+00   4.228416  0.8031364  3.159813
##   1e-04   4.228416  0.8031369  3.159841
##   1e-01   4.275729  0.8030123  3.230414
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 1e-04.

Attribute importance analysis for the best model found

ggplot(varImp(ridge))

What attribute contributes the most to longer or shorter life duration?